The following packages are used in this project:

library(tidyverse) # For data wrangling
library(readxl) # To import excel and csv files
library(countrycode) # To assign the names of the continents for each country
library(readr) # To export the dataframe
library(ggthemes) # Additional themes for graphics
library(plotly) # To make interactive plots
library(gganimate) # To create animated plots

Introduction

In this project, I will recreate three single year plots for the years 1800, 1950 and 2018 (latest available data) as shown in Hans Rosling’s video here:

life expectancy and wealth, 2009

Additionally, I am going to create an animated plot of the last 50 years of data using gganimate. Finally, I will create an interactive plot with 2018 data using plotly.

Data files

The files used for this project, downloaded from https://www.gapminder.org/data/, can be found in the “data” folder of this project.

There are CSV files for the following:

  • Income: “data/income_per_person_gdppercapita_ppp_inflation_adjusted.csv”

  • Life expectancy (years): “data/life_expectancy_years.csv”

  • Population: “data/population_total.csv”

Importing Data

Importing the 3 datasets we need in CSV

# Importing the datasets and assigning shorter names

# check.names is used so the name of the variable does not change after importing.

inc <- read.csv("data/income_per_person_gdppercapita_ppp_inflation_adjusted.csv", check.names = FALSE)

lif <- read.csv("data/life_expectancy_years.csv", check.names = FALSE)

pop <- read.csv("data/population_total.csv", check.names = FALSE)

In the datasets, the name of the continent for each country is missing. I can get it from another table or using a package called countrycode.

# Adding the continent to the life expectancy table using countrycoude package

lif$continent <- countrycode(sourcevar = lif[, "country"],
                            origin = "country.name",
                            destination = "continent")

Tidy Data

Because the datasets are breaking the rules of tidy data, we need to pivot longer all 3 tables.

life_expectancy_long <- lif %>%
# This columns will stay with the new variables
  pivot_longer(cols = c(-country, -continent),
# This will create the variables year and life_expectancy
               names_to = "year",
               values_to = "life_expectancy")

income_long <- inc %>% 
  pivot_longer(-country,
               names_to = "year",
               values_to = "income")

population_long <- pop %>% 
  pivot_longer(-country,
               names_to = "year",
               values_to = "population")

Joins

Joining the 3 datasets; life expectancy, income and population.

# Using a left join to keep the data of the main table life_expectancy_long
# and remove the projections included in the other 2 tables
allcountries <-
  life_expectancy_long %>%
# This columns are the   
  left_join(income_long, by = c("country", "year"))

allcountries <- allcountries %>% 
  left_join(population_long, by = c("country", "year"))

# Checking the first rows to see the joined data.
head(allcountries)
## # A tibble: 6 x 6
##   country     continent year  life_expectancy income population
##   <chr>       <chr>     <chr>           <dbl>  <int>      <int>
## 1 Afghanistan Asia      1800             28.2    603    3280000
## 2 Afghanistan Asia      1801             28.2    603    3280000
## 3 Afghanistan Asia      1802             28.2    603    3280000
## 4 Afghanistan Asia      1803             28.2    603    3280000
## 5 Afghanistan Asia      1804             28.2    603    3280000
## 6 Afghanistan Asia      1805             28.2    603    3280000

Exporting Data

Exporting the clean dataframe as a CSV for later manipulation with other software.

write_csv(allcountries, "clean_df.csv")

Plotting

Using ggplot

Using basic ggplot functions to create the base plots.

# Assigning a name to the plot
chart1 <- allcountries %>% 
# Filtering the year of interest
    filter(year %in% c("1800")) %>%
# Assigning the variables to the axes and coloring the distinct continents
    ggplot(aes(x = income, y = life_expectancy, colour = continent)) +
# Adding the plots where the bigger the population, the bigger the plots.
# Using alpha to add transparency.
    geom_point(aes(size = population), alpha = .5) + 
# Adding the limits and breaks for the y axis.
# This scale is continuous because age does not change that much overtime.
    scale_y_continuous(
      limits = c(10, 95),
      breaks = c(25, 50, 75)) +
# Adding the limits and breaks for the x axis.
# This scale is logarithmic to better represent the change over time.
    scale_x_log10(
      limits = c(400, 120000),
      breaks = c(400, 4000, 40000))
# Displaying the base plot.
chart1

# Repeating the same process for the years 1950 and 2018.
chart2 <- allcountries %>% 
    filter(year %in% c("1950")) %>%
    ggplot(aes(x = income, y = life_expectancy, colour = continent)) +
    geom_point(aes(size = population), alpha = .5) + 
    scale_y_continuous(
      limits = c(10, 95),
      breaks = c(25, 50, 75)) +
    scale_x_log10(
      limits = c(400, 120000),
      breaks = c(400, 4000, 40000))

chart2

chart3 <- allcountries %>% 
    filter(year %in% c("2018")) %>%
    ggplot(aes(x = income, y = life_expectancy, colour = continent)) +
    geom_point(aes(size = population), alpha = .5) + 
    scale_y_continuous(
      limits = c(10, 95),
      breaks = c(25, 50, 75)) +
    scale_x_log10(
      limits = c(400, 120000),
      breaks = c(400, 4000, 40000))

chart3

Finding Relevant Values

Finding interesting data points to show in our plots, including maximum and minimum values for income and life expectancy.

# For the year 1800
allcountries %>% 
  filter(year %in% c("1800")) %>%
# To see the country with highest life expectancy.
  slice_max(life_expectancy)
## # A tibble: 1 x 6
##   country continent year  life_expectancy income population
##   <chr>   <chr>     <chr>           <dbl>  <int>      <int>
## 1 Iceland Europe    1800             42.9    926      61400
allcountries %>% 
  filter(year %in% c("1800")) %>%
# To see the country with highest income.
  slice_max(income)
## # A tibble: 1 x 6
##   country     continent year  life_expectancy income population
##   <chr>       <chr>     <chr>           <dbl>  <int>      <int>
## 1 Netherlands Europe    1800             39.9   4230    2250000
# For the year 1950

allcountries %>% 
  filter(year %in% c("1950")) %>%
  slice_max(life_expectancy)
## # A tibble: 1 x 6
##   country continent year  life_expectancy income population
##   <chr>   <chr>     <chr>           <dbl>  <int>      <int>
## 1 Norway  Europe    1950             71.6  11400    3270000
allcountries %>% 
  filter(year %in% c("1950")) %>%
  slice_max(income)
## # A tibble: 1 x 6
##   country continent year  life_expectancy income population
##   <chr>   <chr>     <chr>           <dbl>  <int>      <int>
## 1 Brunei  Asia      1950             55.5  56600      48000
allcountries %>% 
  filter(year %in% c("1950")) %>%
  slice_min(life_expectancy)
## # A tibble: 1 x 6
##   country continent year  life_expectancy income population
##   <chr>   <chr>     <chr>           <dbl>  <int>      <int>
## 1 Yemen   Asia      1950             23.8   1340    4400000
# For the year 2018

allcountries %>% 
  filter(year %in% c("2018")) %>%
  slice_max(life_expectancy)
## # A tibble: 1 x 6
##   country continent year  life_expectancy income population
##   <chr>   <chr>     <chr>           <dbl>  <int>      <int>
## 1 Japan   Asia      2018             84.2  39100  127000000
allcountries %>% 
  filter(year %in% c("2018")) %>%
  slice_min(life_expectancy)
## # A tibble: 1 x 6
##   country continent year  life_expectancy income population
##   <chr>   <chr>     <chr>           <dbl>  <int>      <int>
## 1 Lesotho Africa    2018             51.1   2960    2260000

Plots Formatting

Adding themes, changing colors, adding labels and reference lines.

# Saving the final plots
lifeexp1800 <- chart1 +
# Adding labels to show information about the plot, like title, labels.
  labs(title = "Income versus Life Expectancy in 1800",
       x = "Income (GDP per capita in USD $)",
       y = "Life expectancy (years)",
       caption = "Source: Gapminder",
       size = "Population (millions)",
       color = "Continent") +
# Setting the size of the bubbles with the same scale for the 3 plots
  scale_size(
      # The range is the size of the bubbles, higher would mean bigger difference between bubbles.
       range = c(0.1, 15),
      # The limits of population from NA to the highest which is China just under 1500 million.
       limits = c(NA,1500000000), 
      # Multiplying the population by millions so it would be easier to read.
       breaks = 1000000 * c(10, 50, 100, 500, 1000, 1500),
       labels = c("10", "50", "100", "500", "1000", "1500")) +
# Changing the color palette
  scale_colour_brewer(palette = "Set1") +
# Changing the default theme
  theme_classic() +
# Using the override function to make the color label bigger
  guides(color = guide_legend(override.aes = list(size = 5, alpha = .5))) +
# Adding the vertical and horizontal lines to replicate Hans Rosling's plots
  geom_vline(xintercept = c(400, 4000, 40000),
             # Adding a light color, with small size and .5 transparency so it is not distracting.
                color = "grey", size = .2, alpha = .5) +
# Doing the same on the y axes.  
  geom_hline(yintercept = c(25, 50, 75), 
                color = "grey", size = .2, alpha = .5) +
# Labeling the countries with the interesting points we found earlier.
# The life_expectancy has a +5 to show the label higher on the y axes.
  geom_text(aes(x = income, y = life_expectancy + 5, label = country),
               color = "grey50",
            # Filtering the data to show only the 2 countries we want.
               data = filter(allcountries, year == 1800, country %in% c("Iceland", "Netherlands")))

# Showing the result
lifeexp1800

# Repeating the same process for the next 2 plots.
lifeexp1950 <- chart2 + 
  labs(title = "Income versus Life Expectancy in 1950",
       x = "Income (GDP per capita in USD $)",
       y = "Life expectancy (years)",
       caption = "Source: Gapminder",
       size = "Population (millions)",
       color = "Continent") +
  scale_size(
       range = c(0.1, 15),    
       limits = c(NA,1500000000), 
       breaks = 1000000 * c(10, 50, 100, 500, 1000, 1500),
       labels = c("10", "50", "100", "500", "1000", "1500")) +
  scale_colour_brewer(palette = "Set1") +  
  theme_classic() +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  geom_vline(xintercept = c(400, 4000, 40000), 
                color = "grey", size = .2, alpha = .5) +
  geom_hline(yintercept = c(25, 50, 75), 
                color = "grey", size = .2, alpha = .5) +
  geom_text(aes(x = income, y = life_expectancy + 5, label = country),
            color = "grey50",
            data = filter(allcountries, year == 1950, country %in% c("Norway", "Brunei", "Yemen")))

lifeexp1950

lifeexp2018 <- chart3 + 
  labs(title = "Income versus Life Expectancy in 2018",
       x = "Income (GDP per capita in USD $)",
       y = "Life expectancy (years)",
       caption = "Source: Gapminder",
       size = "Population (millions)",
       color = "Continent") +
  scale_size(
       range = c(0.1, 15),    
       limits = c(NA,1500000000), 
       breaks = 1000000 * c(10, 50, 100, 500, 1000, 1500),
       labels = c("10", "50", "100", "500", "1000", "1500")) +
  scale_colour_brewer(palette = "Set1") +
  theme_classic() +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  geom_vline(xintercept = c(400, 4000, 40000), 
                color = "grey", size = .2, alpha = .5) +
  geom_hline(yintercept = c(25, 50, 75), 
                color = "grey", size = .2, alpha = .5) +
  geom_text(aes(x = income, y = life_expectancy + 5, label = country),
                color = "grey50",
                data = filter(allcountries, year == 2018, country %in% c("Japan", "Lesotho")))

lifeexp2018

Exporting Plots as Image Files

Saving the plots as jpg with a widescreen 16:9 ratio.

# Adding the name and image extension of the file you want.
ggsave("lifeexp1800.jpg",
  # Selecting the plot. you want to save.     
  lifeexp1800,
  # Specifying the aspect ratio.
  width = 16, height = 9)

# Repeating the process for the next plots.
ggsave("lifeexp1950.jpg",
  lifeexp1950,
  width = 16, height = 9)

ggsave("lifeexp2018.jpg",
  lifeexp2018,
  width = 16, height = 9)

Creating an Animated Data Visualization

Animation showing the change in life expectancy over the last 50 years.

# Creating a separate dataset for the animation
anim_data <- allcountries %>%
# Changing variable type to integer, because it was a chr and it has to be numeric.  
  mutate(year= as.integer(year)) %>%
# Setting the range of years for the animation  
  filter(year %in% (1968:2018))

# Adding the same format as our previous plots.
anim_output <- ggplot(anim_data, aes(income, life_expectancy, size = population, color = continent, frame = year)) +
  labs(x="Income (GDP per capita in USD $)", 
       y = "Life Expectancy (years)", 
       caption = "Source: Gapminder", 
       size = "Population (millions)",
       color = 'Continent') + 
  scale_y_continuous(
    limits = c(10, 95),
    breaks = c(25, 50, 75)) +
  scale_x_log10(
    limits = c(400, 120000), 
    breaks = c(400, 4000, 40000)) +
  scale_colour_brewer(palette = "Set1") +
  theme_classic() +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  geom_vline(xintercept = c(400, 4000, 40000), 
                color = "grey", size = .2, alpha = .5) +
  geom_hline(yintercept = c(25, 50, 75), 
                color = "grey", size = .2, alpha = .5) +
  geom_point(aes(), alpha = .5) +
  scale_size(
       range = c(0.1, 15),    
       limits = c(NA,1500000000), 
       breaks = 1000000 * c(10, 50, 100, 500, 1000, 1500),
       labels = c("10", "50", "100", "500", "1000", "1500")) +
# gganimate parameters, title with year changing with the plots.
  ggtitle("Income versus Life Expectancy, year: {frame_time}") +
  transition_time(year) +
  ease_aes("linear") +
  enter_fade() +
  exit_fade()
# Animation parameter, duration, frames per second, size and using the gifski renderer for the output..
animate(anim_output, duration = 10, fps = 20, width = 800, height = 400, renderer = gifski_renderer())

# Saving the animation as a GIF
anim_save("capstone_animation.gif")

Interactive Data Visualization

Creating an interactive visualization with information by country.

# Filtering the data to use the latest year
int_data <- allcountries %>% 
  filter(year=="2018")

# Interactive version, using mutate to create new variables to show in the tooltip.
interactive <- int_data %>%
# Mutating and rounding the income to 0 decimals.
  mutate(income=round(income,0)) %>%
# Mutating and dividing the population so it would be easier to read.
  mutate(population=round(population/1000000,2)) %>%
# Mutating life expectancy and rounding with 1 decimal.
  mutate(life_expectancy=round(life_expectancy,1)) %>%
# Reordering the countries
  arrange(desc(population)) %>%
  mutate(country = factor(country, country)) %>%
# Text for tooltip
  mutate(text = paste("Country: ", country, "\nPopulation (M): ", population, "\nLife Expectancy: ", life_expectancy, "\nIncome: ", income, sep="")) %>%
# Creating the plot
  ggplot( aes(x=income, y=life_expectancy, size = population, color = continent, text=text)) +
    geom_point(aes(size = population), alpha = .5) + 
    scale_y_continuous(
      limits = c(50, 90),
      breaks = c(50, 60, 70, 80, 90)) +
    scale_x_log10(
      limits = c(400, 120000),
      breaks = c(400, 4000, 40000)) +
    scale_colour_brewer(palette = "Set1") +  
    theme_classic() +
    theme(legend.position="none") +
  labs(title = "Income versus Life Expectancy in 2018",
       x = "Income (GDP per capita in USD $)",
       y = "Life expectancy (years)",
       caption = "Source: Gapminder")

# Using plotly to make the plot interactive and show each country information with the mouse over.
int2018 <- ggplotly(interactive, tooltip="text")

int2018